Import packages

## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.0     ✓ purrr   0.3.3
## ✓ tibble  3.0.1     ✓ dplyr   0.8.5
## ✓ tidyr   1.0.2     ✓ stringr 1.4.0
## ✓ readr   1.3.1     ✓ forcats 0.5.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
## Warning in library(stringr): package 'stringr' already present in search()
## now dyn.load("/Library/Frameworks/R.framework/Versions/3.6/Resources/library/Matrix/libs/Matrix.so") ...
## now dyn.load("/Library/Frameworks/R.framework/Versions/3.6/Resources/library/SnowballC/libs/SnowballC.so") ...
## now dyn.load("/Library/Frameworks/R.framework/Versions/3.6/Resources/library/tokenizers/libs/tokenizers.so") ...
## Loading required package: jiebaRD
## now dyn.load("/Library/Frameworks/R.framework/Versions/3.6/Resources/library/jiebaR/libs/jiebaR.so") ...
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:dplyr':
## 
##     intersect, setdiff, union
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union

Loading rda data

將會產生兩個data frames, posts and comments

針對posts 跟 comments 兩個資料框合併並做處理

Tokenization

將資料框做斷詞處理

Feature Generation

將斷詞後的資料框,查找留言是否有出現以下規則,並將其出現次數變成新的Feature,以利我們後續用PCA來降維

featured_df <- unnested_df%>%
  group_by(commentor,ccontent,ptitle,comment_id)%>%
  mutate(f1=sum(str_detect(word, "韓國瑜")))%>%
  mutate(f2=sum(str_detect(word, "韓粉")))%>%
  mutate(f3=sum(str_detect(word, "韓")))%>%
  mutate(f4=sum(str_detect(word, "韓總")))%>%
  mutate(f5=sum(str_detect(word, "蔡英文")))%>%
  mutate(f6=sum(str_detect(word, "國民黨")))%>%
  mutate(f7=sum(str_detect(word, "民進黨")))%>%
  mutate(f8=sum(str_detect(word, "綠營")))%>%
  mutate(f9=sum(str_detect(word, "藍營")))%>%
  mutate(f10=sum(str_detect(word, "蔡總統")))%>%
  mutate(f11=sum(str_detect(word, "韓市長")))%>%
  mutate(f13=sum( str_detect(word, "韓國瑜")&& str_detect(word, "好|讚|棒|厲害|強|負責|支持|投給")   ))%>%
  mutate(f14=sum( str_detect(word, "蔡英文")&& str_detect(word, "好|讚|棒|厲害|強|負責|支持|投給")   ))%>%
  mutate(f15=sum( str_detect(word, "韓國瑜")&& str_detect(word, "不好|爛|糟|垃圾|強|白痴|笨蛋|不支持|不投給|下台")   ))%>%
  mutate(f16=sum( str_detect(word, "蔡英文")&& str_detect(word, "不好|爛|糟|垃圾|強|白痴|笨蛋|不支持|不投給|下台")   ))%>%
  mutate(f17=sum( str_detect(word, "國民黨|藍營")&& str_detect(word, "好|讚|棒|厲害|強|負責|支持|投給")   ))%>%
  mutate(f18=sum( str_detect(word, "國民黨|藍營")&& str_detect(word, "不好|爛|糟|垃圾|強|白痴|笨蛋|不支持|不投給|下台")   ))%>%
  mutate(f19=sum( str_detect(word, "民進黨|綠營")&& str_detect(word, "好|讚|棒|厲害|強|負責|支持|投給")   ))%>%
  mutate(f20=sum( str_detect(word, "民進黨|綠營")&& str_detect(word, "不好|爛|糟|垃圾|強|白痴|笨蛋|不支持|不投給|下台")   ))%>%
  ungroup()

將重複的row移開,變成乾淨的資料框,以利我們後續用PCA來降維

將前面自己定義的那些Feature丟入PCA做降維

PCA

畫出各個PC的variance

將PC1排序,並發現到PC1分數越高是挺韓者,而分數越低是相對黑韓,故我們輸出分數最高前五位的回文內容與POST標題

我的做法所找出來的這些人,就文字來看確實就是挺韓者, 而我會有把握的原因為:我已經將主要會影響到是否挺韓的因素,都變成新的feature了, 這些規則一共有20個,我再透過PCA來找出最重要影響是否挺韓的主成份,並依此排序找出分數最高前五位的回文內容與POST標題。 故這樣的邏輯算是蠻有把握找出最挺韓的5個回文者。